TO DO:

  1. Create a file such as per_read_modified_base_calls.txt but with binary methylation calls using the default cut-offs, i.e., filter out probabilities >20% and <80%, like the bedMethyl files but not aggregated.
  2. Use this file to calculate autocorrelation between the binary calls on single reads.
  3. See if/how we can aggregate the autocorrelation across reads to identify relevant autocorrelation lags, if any.
  4. Aggregate methylation calls between dyads and proceed doing the same but at the between-dyad level.
  5. Check if autocorrelation on observed data is different as compared to permuted data to assess correlations.
# see here (https://nanoporetech.github.io/megalodon/file_formats.html) for variable names
library(data.table)
binaryCalls <- fread(file = "/Users/koenvandenberge/data/molly/megalodon_output_06/per_read_modified_base_calls_binary.txt.gz",
                     nrows = 1e5)
colnames(binaryCalls) <- c("readID", "chr", "strand", "pos",
                           "mod_log_prob", "can_log_prob", "mod_base")
binaryCalls$methylation <- ifelse(binaryCalls$mod_log_prob > -0.2231436, 1, 0)
table(binaryCalls$methylation)
## 
##     0     1 
## 89988 10012

EDA

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✔ ggplot2 3.3.2     ✔ purrr   0.3.3
## ✔ tibble  2.1.3     ✔ dplyr   1.0.3
## ✔ tidyr   1.0.2     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::between()   masks data.table::between()
## ✖ dplyr::filter()    masks stats::filter()
## ✖ dplyr::first()     masks data.table::first()
## ✖ dplyr::lag()       masks stats::lag()
## ✖ dplyr::last()      masks data.table::last()
## ✖ purrr::transpose() masks data.table::transpose()
readCalls <- binaryCalls %>% 
  group_by(readID) %>%
  summarize(avgMeth = mean(methylation),
            length = n())
hist(readCalls$avgMeth, breaks=20)

logit <- function(x) log(x / (1-x))
plot(x=readCalls$length, y=logit(readCalls$avgMeth), log="x",
     xlab = "Number of bases with confident methylation calls in a read",
     ylab = "Logit average methylation")

Autocorrelation

Below we look at autocorrelation for random reads. It is clear that there is strong autocorrelation. This is very obvious when comparing the autocorrelation function of the observed data versus that of permuted reads.

Note the interesting observation that the ACF function appears bimodal. Clould this be related to nucleosome positions? Since the spacing is unequal across reads, does the multi-modality relate to a particular genomic distance?

For random reads

rafalib::mypar(mfrow=c(4,4),
              mar=c(1,2.5,3.2,1))
#for(rr in 1:27){
for(rr in 100:127){
  curAvgMeth <- readCalls$avgMeth[rr]
  if(curAvgMeth >0 & curAvgMeth < 1){
    rid <- readCalls$readID[rr]
    curDf <- binaryCalls[binaryCalls$readID == rid,]
    curRange <- range(curDf$pos)
    curChr <- unique(curDf$chr)
    acf(curDf$methylation, ylim=c(0,1), xlim=c(0,30),
        main=paste0("chr", curChr,": ", round(curRange[1]/1e3, digits = 2),
                    " - ", round(curRange[2]/1e3, digits = 2), "kb"))
  } else next
}

Understand autocorrelation plots by implementing them myself

rafalib::mypar(mfrow=c(4,4),
              mar=c(1,2.5,3.2,1))
#for(rr in 1:27){
for(rr in 100:127){
  curAvgMeth <- readCalls$avgMeth[rr]
  if(curAvgMeth >0 & curAvgMeth < 1){
    rid <- readCalls$readID[rr]
    curDf <- binaryCalls[binaryCalls$readID == rid,]
    curRange <- range(curDf$pos)
    curChr <- unique(curDf$chr)
    y <- curDf$methylation
    y <- y - mean(y)
    maxLag <- 30
    n <- length(y)
    acf <- c()
    for(ll in 1:maxLag){
      acf[ll] <- cor(y[1:(n-ll)], y[(ll+1):n])
    }
    plot(x=1:maxLag, y=acf, type='h', ylim=c(0,1),
         main=paste0("chr", curChr,": ", round(curRange[1]/1e3, digits = 2),
                    " - ", round(curRange[2]/1e3, digits = 2), "kb"))
  } else next
}
## Warning in cor(y[1:(n - ll)], y[(ll + 1):n]): the standard deviation is zero

## Warning in cor(y[1:(n - ll)], y[(ll + 1):n]): the standard deviation is zero

## Warning in cor(y[1:(n - ll)], y[(ll + 1):n]): the standard deviation is zero

## Warning in cor(y[1:(n - ll)], y[(ll + 1):n]): the standard deviation is zero

## Warning in cor(y[1:(n - ll)], y[(ll + 1):n]): the standard deviation is zero

## Warning in cor(y[1:(n - ll)], y[(ll + 1):n]): the standard deviation is zero

## Warning in cor(y[1:(n - ll)], y[(ll + 1):n]): the standard deviation is zero

## Warning in cor(y[1:(n - ll)], y[(ll + 1):n]): the standard deviation is zero

## Warning in cor(y[1:(n - ll)], y[(ll + 1):n]): the standard deviation is zero

## Warning in cor(y[1:(n - ll)], y[(ll + 1):n]): the standard deviation is zero

## Warning in cor(y[1:(n - ll)], y[(ll + 1):n]): the standard deviation is zero

## Warning in cor(y[1:(n - ll)], y[(ll + 1):n]): the standard deviation is zero

## Warning in cor(y[1:(n - ll)], y[(ll + 1):n]): the standard deviation is zero

## Warning in cor(y[1:(n - ll)], y[(ll + 1):n]): the standard deviation is zero

## Warning in cor(y[1:(n - ll)], y[(ll + 1):n]): the standard deviation is zero

## Warning in cor(y[1:(n - ll)], y[(ll + 1):n]): the standard deviation is zero

Autocorrelation function on permutations of random reads

rafalib::mypar(mfrow=c(4,4))
for(rr in 1:27){
  curAvgMeth <- readCalls$avgMeth[rr]
  if(curAvgMeth >0 & curAvgMeth < 1){
    acf(sample(binaryCalls$methylation[binaryCalls$readID == readCalls$readID[rr]]),
        ylim=c(0,1))
  } else next
}

Autocorrelation at relevant scales: considering linker regions

There is high autocorrelation on the methylation signal on single reads, at the level of single methylated adenine nucleotides. Note that, this could however be expected. Indeed, Molly suggested that the actual biologically relevant resolution might be the resolution of linker regions inbetween nucleosomes. In order to identify these linker regions, we could use the nucleosome occupancy data from Chereji et al. (2018) and, for example, set a threshold on when a region is ‘not occupied by nucleosomes’, and then calculating autocorrelation between those regions.

CHECK: Does conditioning matter? Does correlation change when looking at (a) a position close to the silencer regions with a position further out in the HMR locus, versus (b) a position in the middle of the HMR locus with a position close to the silencer region? It could be that, for example, if the position close to the silencer region is methylated, there’s a reasonable probability that a position in the middle of the locus is also methylated. However, the reverse may be less clear. It’s possible that if the position in the middle of the locus is not methylated, the position close to the silencer region will never be methylated, for example.

Import aggreated methylation bed file

meth <- fread("~/data/molly/modified_bases.6mA.bed")
colnames(meth) <- c("chrom", "start", "end", "name", "score", 
                    "strand", "startCodon", "stopCodon", "color", 
                    "coverage", "percentage")

meth_cols <- c("chrom", "start", "coverage", "percentage")
filtered_meth <- meth[coverage > 10, ..meth_cols]

Import hereji et al 2018 nucleosome occupancy data

nucs <- fread("~/data/molly/Chereji_Occupancy_rep1_singlebp.bedGraph")
colnames(nucs) <- c("chrom", "start", "end", "occupancy")

#plot percentage methylation as a column chart
HMR <- filtered_meth[chrom == "III" & start > 291e3 & start < 294e3]
HMR_nucs <- nucs[chrom == "chrIII" & start > 291e3 & start < 294e3]

ggplot() +
  geom_col(HMR, mapping = aes(x = start, y = percentage), color = "mediumpurple4") +
  geom_col(HMR_nucs, mapping = aes(x = start, y = occupancy * 20), color = "black") +
  theme_minimal() +
  labs(title = "SIR3-EcoGII (JRY13027)",
       x = "position on chr III",
       y = "% methylated reads")

plot(x=HMR$start, y=rep(-3,3,length.out=nrow(HMR)), 
     type="n", ylim=c(-3,3), 
     ylab="Scaled average methylation / occupancy",
     xlab="Chromosome position")
# Methylation
loMethyl <- loess(percentage/100 ~ start, data=HMR, weights=HMR$coverage, enp.target = 50)
lines(x=loMethyl$x[order(loMethyl$x)], y=scale(loMethyl$fitted[order(loMethyl$x)]), 
      col="orange", lwd=4)
# Nucleosome occupancy
loOccup <- loess(occupancy/max(occupancy) ~ start, data=HMR_nucs, enp.target = 50)
lines(x=loOccup$x, y=scale(loOccup$fitted), col="darkseagreen3", lwd=4)
legend("topleft", c("Methylation", "Nucleosome occupancy"), 
       col=c("orange", "darkseagreen3"), lty=1, cex=1,
       bty='n', lwd=4)

Define linker regions by thresholding nucleosome occupancy data on HMR region

# Set arbitrary threshold for defining linnker regions
plot(HMR_nucs$start, HMR_nucs$occupancy, pch=16, cex=1/3)
abline(h=0.5, lty=2, col="red")

# Aggregate methylation calls at the level of linker regions
FtoT <- which(diff(HMR_nucs$occupancy < 1/2) == 1)
TtoF <-  which(diff(HMR_nucs$occupancy < 1/2) == -1)
dfLink <- data.frame(start = FtoT,
                     end = TtoF)
# only keep linker regions with at least 5bp in size
dfLink <- dfLink[(dfLink$end - dfLink$start) >=5,]

plot(HMR_nucs$start, HMR_nucs$occupancy, pch=16, cex=1/3)
abline(h=0.5, lty=2, col="red")
abline(v=HMR_nucs$start[dfLink$start], col="blue")
abline(v=HMR_nucs$start[dfLink$end], col="green")

binaryCalls <- fread(file = "/Users/koenvandenberge/data/molly/megalodon_output_06/per_read_modified_base_calls_binary.txt.gz",
                     nrows = 6e6, skip=11e6)
colnames(binaryCalls) <- c("readID", "chr", "strand", "pos",
                           "mod_log_prob", "can_log_prob", "mod_base")
table(binaryCalls$chr)
## 
##      II     III      IV 
## 1606661 3632511  760828
binaryCalls$methylation <- ifelse(binaryCalls$mod_log_prob > -0.2231436, 1, 0)
binaryCalls3 <- binaryCalls[binaryCalls$chr == "III",]

## organize reads
readInfo <- binaryCalls3 %>%
  group_by(readID) %>%
  summarize(start=min(pos),
            end=max(pos),
            chr=unique(chr))

startLink <- HMR_nucs$start[dfLink$start[1]]
endLink <- HMR_nucs$start[dfLink$end[nrow(dfLink)]]
# overlappingReads <- readInfo[(readInfo$end >= 291e3 & readInfo$start <=294e3) | (readInfo$start < 294e3 & readInfo$start > 290e3),]
overlappingReads <- readInfo[(readInfo$end >= 291e3 & readInfo$start <=294e3),]

readAvMethList <- list()
for(rr in 1:nrow(overlappingReads)){
  curData <- binaryCalls3[binaryCalls3$readID == overlappingReads$readID[rr],]
  avMethLinkers <- c()
  for(bb in 1:nrow(dfLink)){
    curStart <- HMR_nucs$start[dfLink$start[bb]]
    curEnd <- HMR_nucs$end[dfLink$end[bb]]
    curId <- curData$pos >= curStart & curData$pos <= curEnd
    if(sum(curId) > 0){
      curBinData <- curData[curData$pos >= curStart & curData$pos <= curEnd,]
      avMethLinkers[bb] <- mean(curBinData$methylation)
      names(avMethLinkers)[bb] <- nrow(curBinData) #coverage
    } else {
      avMethLinkers[bb] <- NA
    }
  }
  readAvMethList[[rr]] <- avMethLinkers
}

avMethMat <- do.call(rbind, readAvMethList)
colSums(is.na(avMethMat)) # often no data in defined linker regions
## <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>    1    1 
##   34   19   37   21   54   86   27   31   28   27   49   42
# focus on 24 reads with data in all linker regions
readAvMethListComplete <- readAvMethList[rowSums(is.na(avMethMat)) == 0]
rafalib::mypar(mfrow=c(2,2))
for(ll in 1:length(readAvMethListComplete)){
  y <- readAvMethListComplete[[ll]]
  if(var(y)>0){
      acf(y)
  }
}

# aggregate acf across all reads
acfMat <- matrix(NA, nrow=length(readAvMethListComplete),
                 ncol=11)
rafalib::mypar(mfrow=c(1,1))
for(ll in 1:length(readAvMethListComplete)){
  y <- readAvMethListComplete[[ll]]
  if(var(y)>0){
    acfMat[ll,] <- acf(y, plot=FALSE)$acf[1:11,1,1]
  }
}
plot(x=0:10, y=colMeans(acfMat), type='h',
     xlab="Lag", ylab="Average ACF",
     main="ACF averaged across all complete reads")
abline(h=0, lty=2)

# # all reads
# rafalib::mypar(mfrow=c(4,4))
# for(ll in 1:length(readAvMethList)){
#   y <- readAvMethList[[ll]]
#   y <- y[!is.na(y)]
#   if(length(y) > 3 & var(y)>0){
#       acf(y)
#   }
# }

Define linker regions using dyad positions inferred by Rine Lab

nucDat <- openxlsx::read.xlsx("../../data/HMR_HML_nucleosome positions.xlsx")
nucDatHMR <- nucDat[nucDat$region == "HMR",]

overlappingReadsDyad <- readInfo[(readInfo$end >= min(nucDatHMR$dyad) & readInfo$start <= max(nucDatHMR$dyad)),]
dfLinkDyad <- data.frame(start = nucDatHMR$dyad[-nrow(nucDatHMR)]+1,
                         stop = nucDatHMR$dyad[2:nrow(nucDatHMR)])


readAvMethListDyad <- list()
for(rr in 1:nrow(overlappingReadsDyad)){
  curData <- binaryCalls3[binaryCalls3$readID == overlappingReadsDyad$readID[rr],]
  avMethLinkers <- c()
  for(bb in 1:nrow(dfLinkDyad)){
    curStart <- dfLinkDyad$start[bb]
    curEnd <- dfLinkDyad$stop[bb]
    curId <- curData$pos >= curStart & curData$pos <= curEnd
    if(sum(curId) > 0){
      curBinData <- curData[curData$pos >= curStart & curData$pos <= curEnd,]
      avMethLinkers[bb] <- mean(curBinData$methylation)
      names(avMethLinkers)[bb] <- nrow(curBinData) #coverage
    } else {
      avMethLinkers[bb] <- NA
    }
  }
  readAvMethListDyad[[rr]] <- avMethLinkers
}

avMethMatDyad <- do.call(rbind, readAvMethListDyad)
colSums(is.na(avMethMatDyad)) # often no data in defined linker regions
## <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>    1   37   24   23   30   38 
##   57   67   68   67   76   72   74   76   75   78   78   77   79   78   78   79 
##    2                                                             
##   78   79   70   73   72   63   63   61   63   68   70   66   69
# focus on 62 reads with data in all linker regions
readAvMethListDyadComplete <- readAvMethListDyad[rowSums(is.na(avMethMatDyad)) == 0]
rafalib::mypar(mfrow=c(2,2))
for(ll in 1:length(readAvMethListDyadComplete)){
  y <- readAvMethListDyadComplete[[ll]]
  if(var(y)>0){
      acf(y)
  }
}

# aggregate acf across all reads
acfMat <- matrix(NA, nrow=length(readAvMethListDyadComplete),
                 ncol=15)
rafalib::mypar(mfrow=c(1,1))

for(ll in 1:length(readAvMethListDyadComplete)){
  y <- readAvMethListDyadComplete[[ll]]
  if(var(y)>0){
    acfMat[ll,] <- acf(y, plot=FALSE)$acf[1:15,1,1]
  }
}
plot(x=0:14, y=colMeans(acfMat), type='h',
     xlab="Lag", ylab="Average ACF",
     main="ACF averaged across all complete reads")
abline(h=0, lty=2)

# when permuting the data
acfMatPermuted <- matrix(NA, nrow=length(readAvMethListDyadComplete),
                 ncol=15)
rafalib::mypar(mfrow=c(2,2))
for(ll in 1:length(readAvMethListDyadComplete)){
  y <- readAvMethListDyadComplete[[ll]]
  if(var(y)>0){
      acfMatPermuted[ll,] <- acf(sample(y), plot=FALSE)$acf[1:15,1,1]
  }
}
plot(x=0:14, y=colMeans(acfMatPermuted), type='h',
     xlab="Lag", ylab="Average ACF",
     main="Permuted: ACF averaged across all complete reads")
abline(h=0, lty=2)

Negative autocorrelation due to looking outside the HMR region?

Note that, as the lag of the autocorrelation increases, we are looking at ever shorter sequences that reside at the extremes of the HMR region. In the example above, we have a total of \(29\) linker regions which we identified using the dyad positions. However, if the outermost dyads are actually outside the HMR region, which is unmethylated, but the read within HMR is methylated, then this is what could have caused the negative autocorrelations we’re observing above.

Below, we therefore restrict to only looking within the HMR region.

nucDat <- openxlsx::read.xlsx("../../data/HMR_HML_nucleosome positions.xlsx")
nucDatHMR <- nucDat[nucDat$region == "HMR",]
## only look at linker regions within HMR silencer regions
## HMR E is at III:292674-292769
## HMR I is at III:294805-294864
nucDatHMR <- nucDatHMR[nucDatHMR$Start >= 292674,]
nucDatHMR <- nucDatHMR[nucDatHMR$Stop <= 294864,]
nrow(nucDatHMR) #only 9 linker regions
## [1] 9
overlappingReadsDyad <- readInfo[(readInfo$end >= min(nucDatHMR$dyad) & readInfo$start <= max(nucDatHMR$dyad)),]
dfLinkDyad <- data.frame(start = nucDatHMR$dyad[-nrow(nucDatHMR)]+1,
                         stop = nucDatHMR$dyad[2:nrow(nucDatHMR)])


readAvMethListDyad <- list()
for(rr in 1:nrow(overlappingReadsDyad)){
  curData <- binaryCalls3[binaryCalls3$readID == overlappingReadsDyad$readID[rr],]
  avMethLinkers <- c()
  for(bb in 1:nrow(dfLinkDyad)){
    curStart <- dfLinkDyad$start[bb]
    curEnd <- dfLinkDyad$stop[bb]
    curId <- curData$pos >= curStart & curData$pos <= curEnd
    if(sum(curId) > 0){
      curBinData <- curData[curData$pos >= curStart & curData$pos <= curEnd,]
      avMethLinkers[bb] <- mean(curBinData$methylation)
      names(avMethLinkers)[bb] <- nrow(curBinData) #coverage
    } else {
      avMethLinkers[bb] <- NA
    }
  }
  readAvMethListDyad[[rr]] <- avMethLinkers
}

avMethMatDyad <- do.call(rbind, readAvMethListDyad)
colSums(is.na(avMethMatDyad)) # often no data in defined linker regions
## <NA>    1   37   24   23   30   38    2 
##    8    8    7    9    8    8    9    8
# focus on 117 reads with data in all linker regions, looks pretty random...
readAvMethListDyadComplete <- readAvMethListDyad[rowSums(is.na(avMethMatDyad)) == 0]
rafalib::mypar(mfrow=c(2,2))
for(ll in 1:length(readAvMethListDyadComplete)){
  y <- readAvMethListDyadComplete[[ll]]
  if(var(y)>0){
      acf(y)
  }
}

rafalib::mypar(mfrow=c(1,2))

hist(unlist(readAvMethListDyadComplete), breaks=20,
     xlab="Avg methylatin in linker region",
     main="All reads spanning HMR")
avgMethRead <- unlist(lapply(readAvMethListDyadComplete, mean))
hist(avgMethRead, breaks=20,
     xlab="Avg methylatin of read",
     main="All reads spanning HMR")

# aggregate acf across all reads
acfMat <- matrix(NA, nrow=length(readAvMethListDyadComplete),
                 ncol=8)
rafalib::mypar(mfrow=c(1,2))
for(ll in 1:length(readAvMethListDyadComplete)){
  y <- readAvMethListDyadComplete[[ll]]
  if(var(y)>0){
    acfMat[ll,] <- acf(y, plot=FALSE)$acf[1:8,1,1]
  }
}
plot(x=0:7, y=colMeans(acfMat, na.rm=TRUE), type='h',
     xlab="Lag", ylab="Average ACF",
     main="Avg ACF")
abline(h=0, lty=2)


# when permuting the data
acfMatPermuted <- matrix(NA, nrow=length(readAvMethListDyadComplete),
                 ncol=8)
for(ll in 1:length(readAvMethListDyadComplete)){
  y <- readAvMethListDyadComplete[[ll]]
  if(var(y)>0){
      acfMatPermuted[ll,] <- acf(sample(y), plot=FALSE)$acf[1:8,1,1]
  }
}
plot(x=0:7, y=colMeans(acfMatPermuted, na.rm=TRUE), type='h',
     xlab="Lag", ylab="Average ACF",
     main="Permuted: Avg ACF",
     ylim=c(-0.2, 1))
abline(h=0, lty=2)

### By average methylation in the read
acfMatLow <- acfMat[avgMethRead <= 0.4,]
acfMatMed <- acfMat[avgMethRead > 0.4 & avgMethRead <= 0.6,]
acfMatHigh <- acfMat[avgMethRead > 0.6,]
rafalib::mypar(mfrow=c(1,3))
plot(x=0:7, y=colMeans(acfMatLow, na.rm=TRUE), type='h',
     xlab="Lag", ylab="Average ACF",
     main="Low methylation")
abline(h=0, lty=2)
plot(x=0:7, y=colMeans(acfMatMed, na.rm=TRUE), type='h',
     xlab="Lag", ylab="Average ACF",
     main="Medium methylation")
abline(h=0, lty=2)
plot(x=0:7, y=colMeans(acfMatHigh, na.rm=TRUE), type='h',
     xlab="Lag", ylab="Average ACF",
     main="High methylation")
abline(h=0, lty=2)

## Heatmap of co-methylation
library(pheatmap)
avMethMat <- do.call(rbind, readAvMethListDyadComplete)
pheatmap(cor(avMethMat), cluster_rows=FALSE, cluster_cols=FALSE)

avMethMatCentered <- avMethMat - rowMeans(avMethMat)
pheatmap(cor(avMethMatCentered), cluster_rows=FALSE, cluster_cols=FALSE)

Start from locus in the middle of HMR

nucDat <- openxlsx::read.xlsx("../../data/HMR_HML_nucleosome positions.xlsx")
nucDatHMR <- nucDat[nucDat$region == "HMR",]
## only look at linker regions within HMR silencer regions
## HMR E is at III:292674-292769
## HMR I is at III:294805-294864
nucDatHMR <- nucDatHMR[nucDatHMR$Start >= 293200,]
nucDatHMR <- nucDatHMR[nucDatHMR$Stop <= 294864,]
nrow(nucDatHMR) #only 9 linker regions
## [1] 6
overlappingReadsDyad <- readInfo[(readInfo$end >= min(nucDatHMR$dyad) & readInfo$start <= max(nucDatHMR$dyad)),]
dfLinkDyad <- data.frame(start = nucDatHMR$dyad[-nrow(nucDatHMR)]+1,
                         stop = nucDatHMR$dyad[2:nrow(nucDatHMR)])


readAvMethListDyad <- list()
for(rr in 1:nrow(overlappingReadsDyad)){
  curData <- binaryCalls3[binaryCalls3$readID == overlappingReadsDyad$readID[rr],]
  avMethLinkers <- c()
  for(bb in 1:nrow(dfLinkDyad)){
    curStart <- dfLinkDyad$start[bb]
    curEnd <- dfLinkDyad$stop[bb]
    curId <- curData$pos >= curStart & curData$pos <= curEnd
    if(sum(curId) > 0){
      curBinData <- curData[curData$pos >= curStart & curData$pos <= curEnd,]
      avMethLinkers[bb] <- mean(curBinData$methylation)
      names(avMethLinkers)[bb] <- nrow(curBinData) #coverage
    } else {
      avMethLinkers[bb] <- NA
    }
  }
  readAvMethListDyad[[rr]] <- avMethLinkers
}

avMethMatDyad <- do.call(rbind, readAvMethListDyad)
colSums(is.na(avMethMatDyad)) # often no data in defined linker regions
## 24 23 30 38  2 
##  5  4  4  5  4
# focus on 117 reads with data in all linker regions, looks pretty random...
readAvMethListDyadComplete <- readAvMethListDyad[rowSums(is.na(avMethMatDyad)) == 0]
rafalib::mypar(mfrow=c(2,2))
for(ll in 1:length(readAvMethListDyadComplete)){
  y <- readAvMethListDyadComplete[[ll]]
  if(var(y)>0){
      acf(y)
  }
}

hist(unlist(readAvMethListDyadComplete), breaks=20,
     xlab="Avg methylatin in linker region",
     main="All reads spanning HMR")
avgMethRead <- unlist(lapply(readAvMethListDyadComplete, mean))
hist(avgMethRead, breaks=20,
     xlab="Avg methylatin of read",
     main="All reads spanning HMR")

# aggregate acf across all reads
acfMat <- matrix(NA, nrow=length(readAvMethListDyadComplete),
                 ncol=nrow(nucDatHMR)-1)
rafalib::mypar(mfrow=c(1,2))

for(ll in 1:length(readAvMethListDyadComplete)){
  y <- readAvMethListDyadComplete[[ll]]
  if(var(y)>0){
    acfMat[ll,] <- acf(y, plot=FALSE)$acf[1:(nrow(nucDatHMR)-1),1,1]
  }
}
plot(x=0:(nrow(nucDatHMR)-2), y=colMeans(acfMat, na.rm=TRUE), type='h',
     xlab="Lag", ylab="Average ACF",
     main="Avg ACF")
abline(h=0, lty=2)


# when permuting the data
acfMatPermuted <- matrix(NA, nrow=length(readAvMethListDyadComplete),
                 ncol=(nrow(nucDatHMR)-1))
for(ll in 1:length(readAvMethListDyadComplete)){
  y <- readAvMethListDyadComplete[[ll]]
  if(var(y)>0){
      acfMatPermuted[ll,] <- acf(sample(y), plot=FALSE)$acf[1:(nrow(nucDatHMR)-1),1,1]
  }
}
plot(x=0:(nrow(nucDatHMR)-2), y=colMeans(acfMatPermuted, na.rm=TRUE), type='h',
     xlab="Lag", ylab="Average ACF",
     main="Permuted: Avg ACF",
     ylim=c(-0.2, 1))
abline(h=0, lty=2)

### By average methylation in the read
acfMatLow <- acfMat[avgMethRead <= 0.4,]
acfMatMed <- acfMat[avgMethRead > 0.4 & avgMethRead <= 0.6,]
acfMatHigh <- acfMat[avgMethRead > 0.6,]
rafalib::mypar(mfrow=c(1,3))
plot(x=0:(nrow(nucDatHMR)-2), y=colMeans(acfMatLow, na.rm=TRUE), type='h',
     xlab="Lag", ylab="Average ACF",
     main="Low methylation")
abline(h=0, lty=2)
plot(x=0:(nrow(nucDatHMR)-2), y=colMeans(acfMatMed, na.rm=TRUE), type='h',
     xlab="Lag", ylab="Average ACF",
     main="Medium methylation")
abline(h=0, lty=2)
plot(x=0:(nrow(nucDatHMR)-2), y=colMeans(acfMatHigh, na.rm=TRUE), type='h',
     xlab="Lag", ylab="Average ACF",
     main="High methylation")
abline(h=0, lty=2)